# Graph data preparation #
# This script computes APC-values for mean, divergence, and Sorting as described in the manuscript, while holding other predictors at their means
# These computations differ for the Sorting models, where we are mostly concerned with interaction effects
# Note that we are working with a long to wide format. Basically, we compute a host of additional columns which we use for rowwise computation

# Load packages and install them if necessary (automatic)
need = c(
  "dplyr",           #data structuring/cleaning
  "tidyr"            #data structuring/cleaning
)     
have = need %in% rownames(installed.packages())
if(any(!have)) {install.packages(need[!have])}
pack <- lapply(need, library, character.only = TRUE)
# Clean environment
rm(have, need, pack)

# Load data
load(file = "D:/dv_rep/Datasets/gss_Boot.Rdata")

# Add grouping variable to indicate whether a term is an interaction (:) or not
gss_Boot <- gss_Boot %>%
  mutate(interaction = ifelse(startsWith(term, ":"), "Interaction", "Main")) %>%
  mutate(intgroup = interaction(group, interaction)) %>%
  select(!interaction)

# First, let's compute the effects by frequency
dat_Plot <- gss_Boot %>%
  rowwise() %>%
  mutate(eff = coef*(Freq * 100)) %>%
  group_by(var, controls, facet, boot, intgroup) %>%
  mutate(effect = sum(eff) / 100) %>%
  ungroup() %>%
  as.data.frame()

# Extract constant (Intercept) by model
dat_Plot <- dat_Plot %>%
  group_by(var, controls, boot, facet) %>%
  mutate(constant = as.numeric(paste(coef[term == "Intercept"]))) %>%
  ungroup() %>%
  as.data.frame()

# Compute mean effects for APC factors and covariates
# Note: we are only concerned with the interaction effects for Sorting
dat_Plot <- dat_Plot %>%
  group_by(var, controls, boot, facet) %>%
  mutate(meanYear    = case_when(facet == "Sorting" ~ as.numeric(paste(mean(effect[intgroup %in% "Year.Interaction"]))),
                                 TRUE ~ as.numeric(paste(mean(effect[intgroup %in% "Year.Main"])))),
         meanCohort  = case_when(facet == "Sorting" ~ as.numeric(paste(mean(effect[intgroup %in% "Cohort.Interaction"]))),
                                 TRUE ~ as.numeric(paste(mean(effect[intgroup %in% "Cohort.Main"])))),
         meanAge     = case_when(facet == "Sorting" ~ as.numeric(paste(mean(effect[intgroup %in% "Age.Interaction"]))),
                                 TRUE ~ as.numeric(paste(mean(effect[intgroup %in% "Age.Main"])))),
         meanWhite   = case_when(facet == "Sorting" ~ as.numeric(paste(mean(effect[intgroup %in% "White.Interaction"]))),
                                 TRUE ~ as.numeric(paste(mean(effect[intgroup %in% "White.Main"])))),
         meanCollege = case_when(facet == "Sorting" ~ as.numeric(paste(mean(effect[intgroup %in% "College.Interaction"]))),
                                 TRUE ~ as.numeric(paste(mean(effect[intgroup %in% "College.Main"])))),
         meanProt    = case_when(facet == "Sorting" ~ as.numeric(paste(mean(effect[intgroup %in% "Protestant.Interaction"]))),
                                 TRUE ~ as.numeric(paste(mean(effect[intgroup %in% "Protestant.Main"]))))
  ) %>%
  ungroup() %>%
  as.data.frame()

# Impute 0 values for models that do not have control variables 
# Note: we do so because we add 0 (i.e. doesn't change values for the models without control variables)
dat_Plot <- dat_Plot %>% mutate_at(vars(meanWhite, meanCollege, meanProt), ~ifelse(is.na(.), 0, .))

# Extract main effect for Sorting (Variable), this will be similar to the intercept in the other models
dat_Plot <- dat_Plot %>%
  group_by(var, controls, boot, facet) %>%
  mutate(main_effect = ifelse(any(term == "Variable"), as.numeric(paste(coef[term == "Variable"])), NA)) %>%
  ungroup() %>%
  as.data.frame()

# We can also compute values for the reference categories, which are obviously missing in the data frame (regression output)
# First, extract intercepts or variable (again), change them to contain the reference categories, and re-add them to compute reference categories
# Set coef to 0 since we'll add constant (intercept) or variable (main effect) later to coef
int_Cohort <- dat_Plot %>%
  filter((facet == "Sorting" & term == "Variable") | (facet != "Sorting" & term == "Intercept")) %>%
  mutate(
    coef  = 0,
    term  = "Boomer",
    group = "Cohort"
  ) %>%
  mutate(term = case_when(
    facet == "Sorting" ~ ":Boomer",
    TRUE ~ term
  ))

# Repeat for age
int_Age <- dat_Plot %>%
  filter((facet == "Sorting" & term == "Variable") | (facet != "Sorting" & term == "Intercept")) %>%
  mutate(
    coef  = 0,
    term  = "30-64",
    group = "Age"
  ) %>%
  mutate(term = case_when(
    facet == "Sorting" ~ ":30-64",
    TRUE ~ term
  ))

# Note: exception for immigration attitudes because the reference category is 2020 in this model
int_Year <- dat_Plot %>%
  filter((facet == "Sorting" & term == "Variable") | (facet != "Sorting" & term == "Intercept")) %>%
  mutate(
    coef = 0,
    term = case_when(
      var != "Immigration" ~ "1996",
      TRUE ~ "2020"
    ),
    group = "Year"
  ) %>%
  mutate(term = case_when(
    facet == "Sorting" & var != "Immigration" ~ ":1996",
    TRUE ~ term
  )) %>%
  mutate(term = case_when(
    facet == "Sorting" & var == "Immigration" ~ ":2020",
    TRUE ~ term
  ))

# Now, combine these rows:
gss_Plot <- dat_Plot %>%
  bind_rows(
    int_Cohort, int_Age, int_Year
  )

# Compute mean divergence and Sorting values by model and row
# Note 1: we compute Sorting values based on "Variable" (main effect attitude) instead of "Intercept" (constant)
# Note 2: This command will take up to several minutes to compute
gss_Plot <- gss_Plot %>%
  group_by(var, controls, boot, facet) %>%
  rowwise() %>%
  mutate(value = case_when(
    group == "Year"   & facet != "Sorting" ~ coef + constant + meanCohort + meanAge + meanWhite + meanCollege + meanProt,
    group == "Age"    & facet != "Sorting" ~ coef + constant + meanCohort + meanYear + meanWhite + meanCollege + meanProt,
    group == "Cohort" & facet != "Sorting" ~ coef + constant + meanAge + meanYear + meanWhite + meanCollege + meanProt,
    group == "Year"   & facet == "Sorting" ~ coef + main_effect + meanCohort + meanAge + meanWhite + meanCollege + meanProt,
    group == "Age"    & facet == "Sorting" ~ coef + main_effect + meanCohort + meanYear + meanWhite + meanCollege + meanProt,
    group == "Cohort" & facet == "Sorting" ~ coef + main_effect + meanAge + meanYear + meanWhite + meanCollege + meanProt,
    TRUE ~ coef
  )) %>%
  ungroup() %>%
  as.data.frame()

# Specify z-values to compute the 95% associated confidence intervals
z_value <- qnorm(1 - (1 - 0.95) / 2)

# Compute confidence intervals
gss_Plot <- gss_Plot %>%
  mutate(
    lower_ci_value = value - z_value * se, 
    upper_ci_value = value + z_value *se
  )

# Define the desired order in which the facets should appear in the plot
desired_order <- c(
  "Ideological self-placement",
  "Social welfare",
  "Race and gender",
  "Culture and morality",
  "Immigration"
)

# Reorder the levels of var
gss_Plot <- gss_Plot %>% mutate(var = factor(var, levels = desired_order))

# Save output
save(gss_Plot, file = "D:/dv_rep/Datasets/gss_boot_Plot.Rdata")
